meta <- read.csv('classification-outputs/museumstation_subset_classification.csv') %>%
as.tibble() %>%
mutate(session_id = paste('cdm_',session_ids,sep="")) %>%
mutate(age_numeric = ages) %>%
mutate(age = paste('age',ages,sep="")) %>%
mutate(category = target_classes) %>%
mutate(category_short = str_split_fixed(category," ",2)[,2]) %>%
mutate(image_name = paste(category,'_sketch_', age,'_', session_id,'.png',sep="")) %>%
select(-X) ## Warning: package 'bindrcpp' was built under R version 3.4.4
Now should be able to look at target label probabilties for each image as a function of all of these other factors
d <- read.csv('mongodb-output/MuseumStation_AllDescriptives_20780_images_final_cdm_run_v3.csv') %>%
as.tibble() %>%
left_join(meta) %>% # should join on session_id, category, age -- combination of which is unique identifier for an image
filter(!is.na(target_label_prob)) # if intermediate file, only look where we have data## Joining, by = c("session_id", "category", "age")
## Warning: Column `session_id` joining factor and character vector, coercing
## into character vector
## Warning: Column `category` joining factors with different levels, coercing
## to character vector
## Warning: Column `age` joining factor and character vector, coercing into
## character vector
## we only have meta data for those in cdm_run_v3, and we ran some classifications on cdm_run_v2 -- so mismatch here.
# test_meta <- meta %>%
# filter(!(session_id %in% mongodb_meta$session_id)) ggplot(d, aes(age_numeric,target_label_prob, col=image_scores)) +
theme_few() +
geom_jitter(alpha=.2, height=0, width=.3) +
geom_smooth(span=10, col='dark grey') +
# scale_color_viridis(option="B") +
theme(legend.position = "none") +
facet_wrap(~category) +
labs(y = 'Correct label probability', x = 'Age')## `geom_smooth()` using method = 'loess'
ggplot(d, aes(num_strokes,log(target_label_prob), color=image_scores)) +
geom_point(alpha=.5) +
theme_few() +
geom_smooth(method='lm') +
xlim(c(0,50)) +
facet_wrap(~category)## Warning: Removed 67 rows containing non-finite values (stat_smooth).
## Warning: Removed 67 rows containing missing values (geom_point).
ggplot(d, aes(mean_intensity,target_label_prob, color=image_scores)) +
geom_point(alpha=.5) +
theme_few() +
geom_smooth(method='lm') +
facet_wrap(~category)ggplot(d, aes(draw_duration_old,target_label_prob, color=image_scores, names = "recognized or not")) +
geom_point(alpha=.5) +
theme_few() +
geom_smooth(method='lm') +
facet_wrap(~category) +
labs(x='Time spent drawing', y='Log probability', color="correctly classified?")ggplot(d, aes(mean_intensity,num_strokes)) +
geom_point(alpha=.5) +
theme_few() +
geom_smooth(method='lm') +
facet_wrap(~age_numeric) +
ylim(c(0,50)) + ## some crazy outliers here, probably scribbles we didn't catch?
labs(x='Ink', y='Num strokes')## Warning: Removed 67 rows containing non-finite values (stat_smooth).
## Warning: Removed 67 rows containing missing values (geom_point).
cor_by_age <- d %>%
group_by(age_numeric,category) %>%
summarize(avg_cor = mean(image_scores)) %>%
group_by(age_numeric) %>%
multi_boot_standard(col = "avg_cor")
(cor_by_age_plot = ggplot(cor_by_age, aes(age_numeric,mean, color=age_numeric)) +
geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper)) +
theme_few() +
labs(x='Age', y='Avg correct') +
scale_color_viridis(option="D") +
theme(legend.position = "none") +
geom_smooth(col='grey',span=10))## `geom_smooth()` using method = 'loess'
ggsave('plots-nov26/cor_by_age.png',cor_by_age_plot, width = 6, height = 3)## `geom_smooth()` using method = 'loess'
prob_by_age <- d %>%
group_by(age_numeric,category) %>%
summarize(avg_prob = mean(target_label_prob)) %>%
group_by(age_numeric) %>%
multi_boot_standard(col = "avg_prob")
(prob_by_age_plot = ggplot(prob_by_age, aes(age_numeric,mean, color=age_numeric)) +
geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper)) +
theme_few() +
labs(x='Age', y='Avg target probabilitiy') +
scale_color_viridis(option="D") +
theme(legend.position = "none") +
geom_smooth(col='grey',span=10))## `geom_smooth()` using method = 'loess'
ggsave('plots-nov26/prob_by_age.png',prob_by_age_plot, width = 6, height = 3)## `geom_smooth()` using method = 'loess'
draw_duration <- d %>%
group_by(age_numeric,category) %>%
summarize(avg_num_strokes = mean(num_strokes), avg_draw_duration = mean(draw_duration_old), avg_intensity = mean(mean_intensity)) %>%
group_by(age_numeric) %>%
multi_boot_standard(col = "avg_draw_duration") %>%
mutate(covariate = 'draw_duration')
num_strokes <- d %>%
group_by(age_numeric,category) %>%
summarize(avg_num_strokes = mean(num_strokes), avg_draw_duration = mean(draw_duration_old), avg_intensity = mean(mean_intensity)) %>%
group_by(age_numeric) %>%
multi_boot_standard(col = "avg_num_strokes") %>%
mutate(covariate = 'num_strokes')
avg_intensity <- d %>%
group_by(age_numeric,category) %>%
summarize(avg_num_strokes = mean(num_strokes), avg_draw_duration = mean(draw_duration_old), avg_intensity = mean(mean_intensity)) %>%
group_by(age_numeric) %>%
multi_boot_standard(col = "avg_intensity") %>%
mutate(covariate = 'intensity')
###
p1=ggplot(draw_duration, aes(age_numeric,mean, color=age_numeric)) +
geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper)) +
theme_few() +
labs(x='Age', y='Draw duration') +
scale_color_viridis(option="D") +
theme(legend.position = "none") +
geom_smooth(col='grey', span = 10)
p2=ggplot(avg_intensity, aes(age_numeric,mean, color=age_numeric)) +
geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper)) +
theme_few() +
labs(x='Age', y='Ink used') +
scale_color_viridis(option="D") +
theme(legend.position = "none") +
geom_smooth(col='grey', span = 10)
p3=ggplot(num_strokes, aes(age_numeric,mean, color=age_numeric)) +
geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper)) +
theme_few() +
labs(x='Age', y='Num Strokes') +
scale_color_viridis(option="D") +
theme(legend.position = "none") +
geom_smooth(col='grey', span = 10)
compiledPlot = ggarrange(cor_by_age_plot,p1,p2,p3, nrow = 1)## `geom_smooth()` using method = 'loess'
## `geom_smooth()` using method = 'loess'
## `geom_smooth()` using method = 'loess'
## `geom_smooth()` using method = 'loess'
# dir.create('plots-nov26')
ggsave('plots-nov26/covariates_by_age.png',compiledPlot, width = 11, height = 4)category_by_age_scores <- d %>%
group_by(category,age_numeric) %>%
multi_boot_standard(col="image_scores")
(category_by_age_scores_plot <- ggplot(category_by_age_scores, aes(age_numeric,mean, col=age_numeric)) +
theme_few() +
geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper)) +
geom_smooth(span=10, col='dark grey') +
scale_color_viridis(option="D") +
theme(legend.position = "none") +
facet_wrap(~category) +
labs(y = 'Average classification accuracy', x = 'Age'))## `geom_smooth()` using method = 'loess'
ggsave('plots-nov26/category_by_age_scores.png', category_by_age_scores_plot, width = 10, height = 8 )## `geom_smooth()` using method = 'loess'
category_by_age_probs <- d %>%
group_by(category,age_numeric) %>%
multi_boot_standard(col="target_label_prob")
ggplot(category_by_age_probs, aes(age_numeric,mean, col=age_numeric)) +
theme_few() +
geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper)) +
geom_smooth(span=10, col='dark grey') +
scale_color_viridis(option="D") +
theme(legend.position = "none") +
facet_wrap(~category) +
labs(y = 'Correct label probability', x = 'Age')## `geom_smooth()` using method = 'loess'
category_by_age_probs_cor_only <- d %>%
filter(image_scores == 1) %>%
group_by(category,age_numeric) %>%
multi_boot_standard(col="target_label_prob")
category_by_age_probs_cor_only_plot <- ggplot(category_by_age_probs_cor_only, aes(age_numeric,mean, col=age_numeric)) +
theme_few() +
geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper)) +
geom_smooth(span=10, col='dark grey') +
scale_color_viridis(option="D") +
theme(legend.position = "none") +
facet_wrap(~category) +
labs(y = 'Correct label probability', x = 'Age')
ggsave('plots-nov26/category_by_age_probs_corr_only.png', category_by_age_probs_cor_only_plot, width = 10, height = 8 )## `geom_smooth()` using method = 'loess'
mod_covariates <- glmer(image_scores ~ scale(age_numeric) +
scale(draw_duration_old) +
scale(mean_intensity) +
scale(num_strokes) +
(1|session_id) +
(1|category),
data = d,
family = "binomial")
kable(summary(mod_covariates)$coef, digits = 3)| Estimate | Std. Error | z value | Pr(>|z|) | |
|---|---|---|---|---|
| (Intercept) | -0.188 | 0.219 | -0.857 | 0.391 |
| scale(age_numeric) | 0.530 | 0.039 | 13.440 | 0.000 |
| scale(draw_duration_old) | 0.211 | 0.043 | 4.927 | 0.000 |
| scale(mean_intensity) | -0.059 | 0.041 | -1.444 | 0.149 |
| scale(num_strokes) | -0.284 | 0.088 | -3.233 | 0.001 |
predicted_df = data.frame(glmer_predictions = predict(mod_covariates, d), age=d$age_numeric, category=d$category)
(predicted_accuracy <- ggplot(data = predicted_df, aes(x=age, y=glmer_predictions, col=age)) +
scale_color_viridis() +
theme_few() +
geom_jitter(alpha=.1, width=.2, height=0) +
stat_smooth(col='grey', method='lm') +
theme(legend.position = "none") +
scale_x_discrete(limits=c(2,3,4,5,6,7,8,9,10)) +
labs(x = 'Age', y = 'Predicted accuracy of actual category'))ggsave('plots-nov26/predicted_accuracy.png',predicted_accuracy, width = 6, height = 4 )mod_covariates_prob <- glmer(target_label_prob ~ scale(age_numeric) +
scale(draw_duration_old) +
scale(mean_intensity) +
scale(num_strokes) +
(1|session_id) +
(1|category),
data = d)## Warning in glmer(target_label_prob ~ scale(age_numeric) +
## scale(draw_duration_old) + : calling glmer() with family=gaussian (identity
## link) as a shortcut to lmer() is deprecated; please call lmer() directly
kable(summary(mod_covariates_prob)$coef, digits = 3)| Estimate | Std. Error | t value | |
|---|---|---|---|
| (Intercept) | 0.090 | 0.003 | 31.212 |
| scale(age_numeric) | 0.007 | 0.000 | 17.255 |
| scale(draw_duration_old) | 0.003 | 0.000 | 7.221 |
| scale(mean_intensity) | -0.002 | 0.000 | -5.368 |
| scale(num_strokes) | -0.001 | 0.000 | -1.729 |
predicted_df = data.frame(glmer_predictions_prob = predict(mod_covariates_prob, d), age=d$age_numeric, category=d$category)
## Model prediction lines overlaid on raw data fed to the model
(predicted_probs <- ggplot(data = predicted_df, aes(x=age, y=glmer_predictions_prob, col=age)) +
scale_color_viridis() +
theme_few() +
geom_jitter(alpha=.1, width=.2, height=0) +
stat_smooth(col='grey', method='lm') +
theme(legend.position = "none") +
scale_x_discrete(limits=c(2,3,4,5,6,7,8,9,10)) +
labs(x = 'Age', y = 'Predicted probability of actual category'))ggsave('plots-nov26/predicted_probs.png',predicted_probs, width = 6, height = 4 )E.g., is there an increase in confidence independent of an increase in accuracy of the classifier?
d_correct <- d %>%
filter(image_scores == 1)
mod_covariates_correct_only <- glmer(target_label_prob ~ scale(age_numeric) +
scale(draw_duration_old) +
scale(mean_intensity) +
scale(num_strokes) +
(1|session_id) +
(1|category),
data = d_correct)## Warning in glmer(target_label_prob ~ scale(age_numeric) +
## scale(draw_duration_old) + : calling glmer() with family=gaussian (identity
## link) as a shortcut to lmer() is deprecated; please call lmer() directly
modelOut=summary(mod_covariates_correct_only)
kable(summary(mod_covariates_correct_only)$coef, digits = 3)| Estimate | Std. Error | t value | |
|---|---|---|---|
| (Intercept) | 0.109 | 0.003 | 34.680 |
| scale(age_numeric) | 0.003 | 0.000 | 8.271 |
| scale(draw_duration_old) | 0.001 | 0.000 | 2.923 |
| scale(mean_intensity) | -0.002 | 0.000 | -5.347 |
| scale(num_strokes) | -0.001 | 0.000 | -2.209 |
### Set parameters first
categories = unique(meta$category)
##
upper_thresholds=c(1,.85,.65,.45,.25,.05)
lower_thresholds=c(.95,.80,.60,.40,.20,0)
##
age_thres = 0
dir_name = 'subset_classification_examples_test'
dir.create(dir_name)## Warning in dir.create(dir_name): 'subset_classification_examples_test'
## already exists
##
for (this_category in categories){
dir.create(file.path(paste(dir_name,'/',this_category,sep="")))
thres_count=0
for (upper in upper_thresholds) {
thres_count = thres_count + 1
lower = lower_thresholds[thres_count]
subset <- meta %>%
filter(age_numeric > age_thres) %>%
group_by(category) %>%
mutate(upper_thres = quantile(target_label_prob, upper, na.rm = TRUE)) %>%
mutate(lower_thres = quantile(target_label_prob, lower, na.rm = TRUE)) %>%
filter(category == this_category) %>%
filter(target_label_prob > lower_thres & target_label_prob < upper_thres) %>%
sample_n(2) %>%
mutate(image_path = paste('srcd-features/museumstation_sketches/',category,'/',image_name,sep="")) %>%
mutate(new_image_path = paste(dir_name,'/',this_category,'/',
round(target_label_prob,4),image_name,sep=""))
file.copy(subset$image_path, subset$new_image_path)
}
}
### Make montages of these randomly sampled sketches for use in diagrams
dir.create(paste0(dir_name,'/montages/'))
for (this_category in categories){
image_read(dir(paste(dir_name, "/",this_category,sep=""), full.names = TRUE)) %>%
image_append(stack = FALSE) %>%
image_write(file.path(paste0(dir_name,"/montages/", this_category,".png")))
}##
age_thres = 4
dir_name = 'subset_classification_examples_ages4_10_correct_only'
dir.create(dir_name)
##
for (this_category in categories){
dir.create(file.path(paste(dir_name,'/',this_category,sep="")))
thres_count=0
for (upper in upper_thresholds) {
thres_count = thres_count + 1
lower = lower_thresholds[thres_count]
subset <- meta %>%
filter(age_numeric > age_thres) %>%
filter(image_scores==1 ) %>%
group_by(category) %>%
mutate(upper_thres = quantile(target_label_prob, upper, na.rm = TRUE)) %>%
mutate(lower_thres = quantile(target_label_prob, lower, na.rm = TRUE)) %>%
filter(category == this_category) %>%
filter(target_label_prob > lower_thres & target_label_prob < upper_thres) %>%
sample_n(2) %>%
mutate(image_path = paste('srcd-features/museumstation_sketches/',category,'/',image_name,sep="")) %>%
mutate(new_image_path = paste(dir_name,'/',this_category,'/',
round(target_label_prob,4),image_name,sep=""))
file.copy(subset$image_path, subset$new_image_path)
}
}
### Make montages of these randomly sampled sketches for use in diagrams
dir.create(paste0(dir_name,'/montages/'))
for (this_category in categories){
image_read(dir(paste(dir_name, "/",this_category,sep=""), full.names = TRUE)) %>%
image_append(stack = FALSE) %>%
image_write(file.path(paste0(dir_name,"/montages/", this_category,".png")))
}